suppressWarnings(suppressMessages(library(completejourney)))
suppressWarnings(suppressMessages(library(tidyverse)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(ggplot2)))
# Bind each source table to a short alias and auto-print it for a quick look.
# The tables come from the packages attached at the top of the script.
x <- get_transactions()
x

p <- products
p

coup <- coupons
coup

coup_red <- coupon_redemptions
coup_red

dem <- demographics
dem

# Attach product attributes to every transaction row. A left join keeps all
# transactions, including any whose product_id has no match in `p`.
tran_prod <- left_join(x, p, by = "product_id")
tran_prod
# Total sales value per product (product type and department are carried along
# as grouping keys), sorted from the highest-selling product downwards.
tran_by_product <- tran_prod %>%
  group_by(product_id, product_type, department) %>%
  summarise(total_sales = sum(sales_value), .groups = "drop") %>%
  arrange(desc(total_sales))
tran_by_product
# Transactions for the three products examined in the basket analysis.
filt_data3 <- tran_prod %>% filter(product_id == 1029743) # milk
filt_prod_2 <- tran_prod %>% filter(product_id == 1082185) # bananas
filt_prod_3 <- tran_prod %>% filter(product_id == 981760) # eggs

# Attach household demographics (ages, incomes, ...) to the milk transactions.
only_milk <- left_join(filt_data3, demographics, by = "household_id")

# Keep only rows with no missing value in ANY column.
# NOTE(review): this also drops rows whose only NA is in a column that the
# later age/income/household-size summaries never use -- confirm this is
# intended rather than dropping on the relevant columns only.
only_milk_noNA <- na.omit(only_milk)
# Basket IDs of every milk transaction row (a basket may appear more than once).
milk_baskets <- filt_data3$basket_id

# Number of distinct baskets that contain milk at all. Despite the variable
# name and the printed label, this is not restricted to baskets that hold
# nothing but milk.
baskets_with_milk_only <- n_distinct(filt_data3$basket_id)

# Baskets that contain both milk and bananas (product_id == 1082185).
milk_banana_baskets <-
  filter(filt_prod_2, basket_id %in% milk_baskets)$basket_id
bananas_with_milk <- n_distinct(milk_banana_baskets)

# Baskets that contain milk, bananas, and eggs (product_id == 981760). The egg
# rows are matched against the milk-and-banana baskets; matching against
# `milk_baskets` alone would also count milk + eggs baskets with no bananas.
eggs_with_milk_and_bananas <- n_distinct(
  filter(filt_prod_3, basket_id %in% milk_banana_baskets)$basket_id
)

# Print the results
cat("Baskets buying Milk Only:", baskets_with_milk_only, "\n")
## Baskets buying Milk Only: 7874
cat("Baskets buying Milk and Bananas:", bananas_with_milk, "\n")
## Baskets buying Milk and Bananas: 1909
cat("Baskets buying Milk, Bananas, and Eggs:", eggs_with_milk_and_bananas, "\n")
## (recorded output omitted: the previously recorded 812 counted milk + eggs
## baskets regardless of bananas; re-run to obtain the three-way count)
# Collect the three basket counts into one table for plotting.
basket_data <- data.frame(
  Category = c(
    "Baskets with Milk and Bananas",
    "Baskets with Milk, Bananas, and Eggs",
    "Baskets with Milk Only"
  ),
  Count = c(
    bananas_with_milk,
    eggs_with_milk_and_bananas,
    baskets_with_milk_only
  )
)

# Each category's share of the summed counts.
# NOTE(review): the categories are nested, not disjoint -- the banana and egg
# counts are drawn from the milk baskets, which the "Milk Only" count also
# includes -- so the slices are shares of a sum that counts some baskets more
# than once. Confirm this is the intended reading of the chart.
basket_data[["Share"]] <- with(basket_data, Count / sum(Count))

# Pie chart: a single stacked bar wrapped into polar coordinates, with each
# slice labelled by its percentage at the middle of the slice.
pie_chart <- ggplot(basket_data, aes(x = "", y = Share, fill = Category)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  labs(title = "Basket Composition") +
  theme_void() +
  scale_fill_brewer(palette = "Set3") +
  geom_text(
    aes(label = scales::percent(Share)),
    position = position_stack(vjust = 0.5)
  )

# Render the chart.
print(pie_chart)
# Milk sales value by income range. Every transaction row is stacked onto the
# bar of its income bracket, so bar height is the bracket's summed sales value.
ggplot(only_milk_noNA, aes(x = income, y = sales_value, fill = income)) +
  geom_col() +
  labs(
    title = "Sales Value by Income Range (Fluid Milk White Only)",
    x = "Income Range",
    y = "Sales Value"
  ) +
  # Tilt the bracket labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Rebuild the milk-only data set: the same filter, demographic join, and
# na.omit() steps that appear earlier in the script are repeated here.
filt_data3 <- tran_prod %>% filter(product_id == 1029743) # Fluid Milk White Only
only_milk <- left_join(filt_data3, demographics, by = "household_id")
only_milk_noNA <- na.omit(only_milk)

# Total milk spending per age group, largest first.
age_group_spending <- only_milk_noNA %>%
  group_by(age) %>%
  summarise(total_spending = sum(sales_value)) %>%
  arrange(desc(total_spending))
age_group_spending

# One bar per age group, coloured by age group with the "Set3" brewer palette.
ggplot(
  age_group_spending,
  aes(x = age, y = total_spending, fill = factor(age))
) +
  geom_col(position = "dodge") +
  labs(
    title = "Total Spending by Age Group (Fluid Milk White Only)",
    x = "Age Group",
    y = "Total Sales Value"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal()
# Milk transactions made by households in the 45-54 age group.
age_group_45_54 <- filter(only_milk_noNA, age == "45-54")

# Their total milk spending per income range, largest first.
income_range_spending <- age_group_45_54 %>%
  group_by(income) %>%
  summarise(total_spending = sum(sales_value)) %>%
  arrange(desc(total_spending))
print(income_range_spending)
## # A tibble: 11 × 2
## income total_spending
## <ord> <dbl>
## 1 50-74K 887.
## 2 75-99K 510.
## 3 125-149K 333.
## 4 35-49K 284.
## 5 25-34K 273.
## 6 Under 15K 176.
## 7 15-24K 170.
## 8 150-174K 108
## 9 100-124K 98.7
## 10 250K+ 56.7
## 11 175-199K 10.8
# Milk sales value by income range for the 45-54 age group; transaction rows
# are stacked, so bar height is the summed sales value per income bracket.
ggplot(age_group_45_54, aes(x = income, y = sales_value, fill = income)) +
  geom_col() +
  labs(
    title = "Total Sales Value by Income Range for 45-54 age group",
    x = "Income Range",
    y = "Total Sales Value"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Milk transactions made by households in the 35-44 age group.
age_group_35_44 <- filter(only_milk_noNA, age == "35-44")

# Their total milk spending per income range, largest first. This reuses (and
# overwrites) the `income_range_spending` name.
income_range_spending <- age_group_35_44 %>%
  group_by(income) %>%
  summarise(total_spending = sum(sales_value)) %>%
  arrange(desc(total_spending))
print(income_range_spending)
## # A tibble: 12 × 2
## income total_spending
## <ord> <dbl>
## 1 50-74K 597.
## 2 35-49K 564.
## 3 75-99K 437.
## 4 150-174K 261.
## 5 125-149K 214.
## 6 Under 15K 162.
## 7 15-24K 157.
## 8 25-34K 73.0
## 9 100-124K 51.9
## 10 200-249K 29.8
## 11 175-199K 29.4
## 12 250K+ 28.7
# Milk sales value by income range for the 35-44 age group; transaction rows
# are stacked, so bar height is the summed sales value per income bracket.
ggplot(age_group_35_44, aes(x = income, y = sales_value, fill = income)) +
  geom_col() +
  labs(
    title = "Total Sales Value by Income Range for 35-44 Age Group",
    x = "Income Range",
    y = "Total Sales Value"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Milk sales value by household size; transaction rows are stacked, so bar
# height is the summed sales value for each household size.
ggplot(
  only_milk_noNA,
  aes(x = household_size, y = sales_value, fill = household_size)
) +
  geom_col() +
  labs(
    title = "Sales Value by Household Size (Fluid Milk White Only)",
    x = "Household size",
    y = "Sales Value"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Total milk sales value per week.
weekly_sales <- only_milk_noNA %>%
  group_by(week) %>%
  summarise(total_sales = sum(sales_value))

# Weekly trend line. Note that `p` is reused here: from this point on it holds
# the plot, no longer the products table assigned to it earlier.
#
# * `linewidth` sets the line thickness; using `size` for lines is deprecated
#   since ggplot2 3.4.0 and triggers a lifecycle warning.
# * `group = 1` joins all weeks into one line even though x is a factor.
# * The x scale is discrete (x = factor(week)), so its values are the factor
#   levels, i.e. character strings; the every-4th-week breaks are therefore
#   passed as character to match them.
p <- ggplot(weekly_sales, aes(x = factor(week), y = total_sales)) +
  geom_line(aes(group = 1), linewidth = 1.5) +
  labs(
    title = "Weekly Sales Trend for Fluid Milk White Only",
    x = "Week",
    y = "Total Sales Value"
  ) +
  scale_x_discrete(
    breaks = as.character(
      seq(min(weekly_sales$week), max(weekly_sales$week), by = 4)
    )
  )
# The three weeks with the highest total sales.
top_peaks <- head(arrange(weekly_sales, desc(total_sales)), n = 3)

# Overlay those weeks on the trend plot as red points, each labelled with its
# week number just above the point.
p +
  geom_point(
    data = top_peaks,
    aes(x = factor(week), y = total_sales),
    color = "red",
    size = 3
  ) +
  geom_text(
    data = top_peaks,
    aes(x = factor(week), y = total_sales, label = week),
    vjust = -0.5,
    hjust = 0.5,
    color = "red",
    size = 3
  )